Huffman Encoding

Just a little example of huffman coding

  1. Compute the character frequencies
  2. Build a Huffman Tree using a priority queue
  3. Encode the message using the generate tree
  4. Decode the compressed msg using the tree

In [1]:
from heapq import heapify, heappush, heappop
from collections import defaultdict

global codes
codes = dict()

class HuffNode(object):
    def __lt__(self, other):
        return self.weight < other.weight
    
    def __init__(self, weight, ltr=None):
        self.weight = weight
        self.ltr = ltr
        self.left = None
        self.right = None

def calculate_frequencies(string):
    freqs = defaultdict(int)
    for c in string:
        freqs[c] += 1
    return freqs

def build_tree(frequencies):
    
    heap = []
    for ltr, weight in frequencies.items():
        node = HuffNode(weight, ltr)
        heappush(heap, node)
    
    while len(heap) > 1:
        n1 = heappop(heap)
        n2 = heappop(heap)
        weight = n1.weight + n2.weight
        root = HuffNode(weight)
        root.left = n1
        root.right = n2
        heappush(heap, root)
    tree = heappop(heap)
    return tree

def generate_codes(node, code=''):
    if node.ltr: #leaf
        codes[node.ltr] = (node.weight, code)
    else:
        code += '0'
        generate_codes(node.left, code)
        code = code[:-1]
        
        code += '1'
        generate_codes(node.right, code)
        code = code[:-1]
        
def print_coding_table(code_dict):
    print("Symbol\tWeight\tCode")
    for symbol, code in sorted(code_dict.items(), key=lambda item: item[1][0], reverse=True):
        print(symbol, *code, sep='\t')
        
def encode(code_dict, msg):
    return ''.join([code_dict[ltr][1] for ltr in msg])

def decode(tree, compressed):
    current_node = tree
    uncompressed = ''
    for bit in compressed:
        if bit == '0':
            current_node = current_node.left
        elif bit == '1':
            current_node = current_node.right
        
        if current_node.ltr:
            uncompressed += current_node.ltr
            current_node = tree 
    return uncompressed

test_string = 'this is an example for huffman encoding'
freqs = calculate_frequencies(test_string)
tree = build_tree(freqs)
generate_codes(tree)
print_coding_table(codes)
compressed = encode(codes, test_string)

print("\nCompressed:\n", compressed, sep='')
decompressed = decode(tree, compressed)
print("\nDecompressed:\n", decompressed, sep='')


Symbol	Weight	Code
 	6	110
n	4	000
a	3	1011
f	3	1010
i	3	1111
e	3	1110
m	2	0110
o	2	10011
s	2	0011
h	2	0010
t	1	10010
l	1	01001
r	1	10000
d	1	01111
g	1	01110
x	1	01010
u	1	01011
p	1	10001
c	1	01000

Compressed:
1001000101111001111011110011110101100011011100101010110110100010100111101101010100111000011000100101110101010011010110001101110000010001001101111111100001110

Decompressed:
this is an example for huffman encoding